Windows and entropies

Words per book



In [1]:

    
import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
import statsmodels



In [2]:

    
# Load the stored book collection from the shelf file.
# NOTE: shelve unpickles its values, so only open shelf files from a trusted source.
myShelf = shelve.open("storage_new.db")
try:
    aBookCollection = myShelf['aBookCollection']
finally:
    # Close explicitly: dropping the name alone does not guarantee that the
    # underlying database file is released.
    myShelf.close()
del myShelf



In [3]:

    
# Build the working subset of books; the second element of the split is discarded.
# NOTE(review): sample_authors looks like a random draw and no seed is set in
# this notebook -- confirm whether results are reproducible across runs.
someBooks, _ = (
    aBookCollection
    .exclude_authors_with_less_than(10)
    .sample_authors(5)
    .split_at_number_per_author(10)
)

# Analyzer built from the selected books; used by the cells below.
aPossibleFeatureAnalyzer = bc.PossibleFeatureAnalyzer.from_book_collection(
    someBooks
)



In [4]:

    
# Frequencies from the analyzer, in the form returned by dataframe_total().
freqDf = (
    aPossibleFeatureAnalyzer
    .frequencies()
    .dataframe_total()
)



In [5]:

    
# Kernel density estimate of the log10-transformed frequencies.
numpy.log10(freqDf).plot(kind='kde')









    Out[5]:





<matplotlib.axes.AxesSubplot at 0x7f8d53fba2d0>



In [6]:

    
import statsmodels.graphics.gofplots as gp
import scipy.stats

# Q-Q plot of log10(frequency) against a uniform reference distribution.
# The return value is bound to `_` so it is not echoed as cell output.
_ = gp.qqplot(
    numpy.log10(freqDf['Value']),
    scipy.stats.uniform(),
)



In [7]:

    
# Entropies from the analyzer, in the form returned by dataframe_total(),
# followed by a kernel density estimate of them.
entrDf = (
    aPossibleFeatureAnalyzer
    .entropies()
    .dataframe_total()
)
entrDf.plot(kind='kde')









    Out[7]:





<matplotlib.axes.AxesSubplot at 0x7f8d536e52d0>



In [8]:

    
# Q-Q plot of the entropy values against a uniform reference distribution.
_ = gp.qqplot(
    entrDf['Value'],
    scipy.stats.uniform(),
)



In [9]:

    
# Pruned analyzer used by the cells below, built with the arguments (0, 1).
# NOTE(review): these are presumably lower/upper frequency-quantile bounds, in
# which case (0, 1) would keep everything -- confirm against the library.
# A previously tried setting, kept for reference:
#   blah = aPossibleFeatureAnalyzer.prune_frequencies_quantiles(0.35, 1)
blah = aPossibleFeatureAnalyzer.prune_frequencies_quantiles(
    0, 1
)



In [10]:

    
# Frequencies of the pruned analyzer, in the form returned by dataframe_total().
freqDf2 = (
    blah
    .frequencies()
    .dataframe_total()
)



In [11]:

    
# Kernel density estimate of the log10-transformed pruned frequencies.
numpy.log10(freqDf2).plot(kind='kde')









    Out[11]:





<matplotlib.axes.AxesSubplot at 0x7f8d50e8ba50>



In [12]:

    
# Q-Q plot of log10(pruned frequency) against a uniform reference distribution.
_ = gp.qqplot(
    numpy.log10(freqDf2['Value']),
    scipy.stats.uniform(),
)



In [13]:

    
# Entropies of the pruned analyzer, in the form returned by dataframe_total().
entrDf2 = (
    blah
    .entropies()
    .dataframe_total()
)



In [14]:

    
# Kernel density estimate of the pruned entropies.
(
    entrDf2
    .plot(kind='kde')
)









    Out[14]:





<matplotlib.axes.AxesSubplot at 0x7f8d539f8f50>



In [15]:

    
# Show the ten rows with the smallest 'Value' (ascending sort).
# DataFrame.sort() was deprecated in pandas 0.17 and later removed;
# sort_values() is the supported replacement for sorting by a column.
entrDf2.sort_values('Value').head(10)









    Out[15]:






  
    
      
      Value
    
  
  
    
      ides
      -0
    
    
      sabres
      -0
    
    
      cables
      -0
    
    
      ites
      -0
    
    
      contralto
      -0
    
    
      hiking
      -0
    
    
      entertainments
      -0
    
    
      trifler
      -0
    
    
      predilection
      -0
    
    
      pacers
      -0



In [16]:

    
# Default figure size (width, height in inches) for subsequent figures.
# matplotlib.pyplot itself defines no `figsize` function; the old call only
# worked where IPython's pylab support had attached one to pyplot. Setting the
# rcParam directly works in any session.
plt.rcParams['figure.figsize'] = (10, 6)

# Entropies in the form returned by dataframe_authors(), one histogram per column.
entrPnl2 = blah.entropies().dataframe_authors()
entrPnl2.hist()









    Out[16]:





array([[<matplotlib.axes.AxesSubplot object at 0x7f8d50d10210>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d5373af50>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d53440190>],
       [<matplotlib.axes.AxesSubplot object at 0x7f8d536ab650>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d505f3ed0>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d53561290>]], dtype=object)



In [17]:

    
# Frequencies in the form returned by dataframe_authors(), one histogram per
# column; log=True is forwarded to the histogram call.
freqPnl2 = (
    blah
    .frequencies()
    .dataframe_authors()
)
freqPnl2.hist(log=True)









    Out[17]:





array([[<matplotlib.axes.AxesSubplot object at 0x7f8d535583d0>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d534a3310>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d53464f10>],
       [<matplotlib.axes.AxesSubplot object at 0x7f8d534990d0>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d50896950>,
        <matplotlib.axes.AxesSubplot object at 0x7f8d5083bcd0>]], dtype=object)



In [18]:

    
# Quantile levels 0.00, 0.02, ..., 0.98: 50 evenly spaced levels (1.0 is not
# included, as before). The float divisor keeps the levels fractional even
# under Python 2 integer division, where v/50 would always evaluate to 0.
quantileLevels = [v / 50.0 for v in range(50)]

# For each column of the per-author entropy frame, record its quantile curve.
df = blah.entropies().dataframe_authors()
mydata = []
for col in df:
    values = df[col].dropna()  # drop NaNs once per column, not once per quantile
    mydata.append([values.quantile(q) for q in quantileLevels])



In [19]:

    
import statsmodels.api as sm
# Functional boxplot over the curves stored in `mydata` (one curve per entry).
sm.graphics.fboxplot(data=mydata)









    Out[19]:





(<matplotlib.figure.Figure at 0x7f8d5053a610>,
 array([ 0.686,  0.712,  0.808,  0.802,  0.654]),
 array([2, 3, 1, 0, 4]),
 array([0, 1, 4]))



In [ ]:

	Value
ides	-0
sabres	-0
cables	-0
ites	-0
contralto	-0
hiking	-0
entertainments	-0
trifler	-0
predilection	-0
pacers	-0